library(ggpubr) # For rremove function
library(tidyverse)
library(tidyr)
library(broom)
library(gapminder)
library(dotwhisker)

### Loading Data
df_analysis <- bind_rows(read_csv("data/tw_df_mps.csv") |> mutate(data = "Twitter Dataset"), 
                         read_csv("data/fb_df_mps.csv") |> mutate(data = "Facebook Dataset"))
df_analysis$elected_via <- factor(df_analysis$elected_via, levels = c("Direct","List"))
df_analysis <- df_analysis |> mutate(incumbent = case_when(
  incumbent == 1 ~ "Incumbent",
  incumbent == 0 ~ "No Incumbent"
))

###Descriptives

common_fill_scale <- scale_fill_manual(name = "Data", 
                                       values = c("Facebook Dataset" = "#2066a8", 
                                                  "Twitter Dataset" = "#ae282c"))
df_analysis$l_mean<-log(df_analysis$mean_distance)
df_analysis$l_mean<-ifelse(is.infinite(df_analysis$l_mean),NA,df_analysis$l_mean)

df_analysis$service_l_mean<-log(df_analysis$mean_service_dist)
df_analysis$service_l_mean<-ifelse(is.infinite(df_analysis$mean_service_dist),NA,df_analysis$mean_service_dist)

all_mod_log_20 <-lm(l_mean ~ elected_via + incumbent+area_district +list_no+ gender+birthyear, df_analysis |> filter(data == "Twitter Dataset"))
all_mod_log_20 <-lm(l_mean ~ elected_via + incumbent+area_district +list_no+ gender+birthyear, df_analysis |> filter(data == "Twitter Dataset"))

df_analysis |> select(data)

# Create the plot
p1 <- ggplot(df_analysis, aes(x = mean_distance, fill = elected_via)) + 
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of District Distance across Mandate Mode", y = "Distance") +
  facet_wrap(~data)+
  scale_fill_discrete(name = "Mandate Mode")+
  theme_bw() +
  theme(legend.position = "bottom") +
  rremove("x.title")

# p2 <- ggplot(df_analysis, aes(x = l_mean, fill = elected_via)) + 
#   geom_density(alpha = 0.3) +
#   labs(title = "Logged Distribution of Distance across Mandate Mode", y = "Log(Distance)") +
#   facet_wrap(~data)+
#   scale_fill_discrete(name = "Mandate Mode")+
#   theme_bw() +
#   theme(legend.position = "bottom") +
#   rremove("x.title")

p2 <- ggplot(df_analysis, aes(x = mean_service_dist, fill = elected_via)) +
  geom_density(alpha = 0.3) +
  labs(title = "Distribution of Service Distance across Mandate Mode", y = "Distance") +
  facet_wrap(~data)+
  scale_fill_discrete(name = "Mandate Mode")+
  theme_bw() +
  theme(legend.position = "bottom") +
  rremove("x.title")
# Arrange plots with a common legend
figure <- ggpubr::ggarrange(p1, p2, nrow = 2, labels = NULL, common.legend = TRUE, legend = "bottom")

# Annotate the figure
annotate_figure(
  figure,
  fig.lab.size = 24
)


ggsave("output/Distribution_logged.png", dpi = 1200, width = 9, height = 7)


# Descriptive Statistics Table
read_csv("data/tw_df_mps.csv") |>
  mutate(elected_via = case_when(
    elected_via == "List" ~ 0,
    elected_via == "Direct" ~1),
    gender = case_when(
      gender == "female" ~ 0,
      gender == "male" ~ 1
    )) |>
  mutate(dataset = "Twitter Dataset") -> df_stargazer_tw

read_csv("data/fb_df_mps.csv") |>
  mutate(elected_via = case_when(
    elected_via == "List" ~ 0,
    elected_via == "Direct" ~1),
    gender = case_when(
      gender == "female" ~ 0,
      gender == "male" ~ 1
    )) |>
  mutate(dataset = "Facebook Dataset")  -> df_stargazer_fb

#
combined_df <- bind_rows(df_stargazer_tw, df_stargazer_fb)
df_stargazer <- as.data.frame(combined_df)
columns <- c("mean_share_mp","mean_distance","mean_service_dist", "area_district" ,"mean_place_density", "elected_via", "gender", "birthyear", "list_no")
names <- c("District Match", "District Distance", "Service Deprivation","District Size","Mean Place Density","Mandate (1=Direct Candidate)", "Gender (1=Male)", "Birthyear", "List Number")
df_stargazer <- as.data.frame(df_stargazer)
df1 <- df_stargazer %>% dplyr::filter(dataset == "Twitter Dataset") |> select(columns)
df2 <- df_stargazer %>% dplyr::filter(dataset == "Facebook Dataset") |> select(columns)
stargazer::stargazer(
  df1,
  df2,
  covariate.labels = names,
  title = "Comparison of Two Datasets",
  column.labels = c("First Dataset", "Second Dataset"),
  align = TRUE,   single.row = T
)




# Identify numeric and categorical columns
numeric_cols <- c("area_district", "mean_place_density","list_no", "birthyear")
categorical_cols <- c("list_state", "gender","incumbent")

# Standardize numeric predictors
df_analysis <- df_analysis %>%
  mutate(across(all_of(numeric_cols), ~ scale(.) %>% as.numeric())) %>%
  mutate(across(all_of(categorical_cols), as.factor))

## H0: Direct Candidates are more likely to mention places within their constituency ----

#FB: without controls
tw_mod_00 <-lm(mean_share_mp ~ elected_via, df_analysis |> filter(data == "Twitter Dataset"))
#FB: with controls
tw_mod_01 <-lm(mean_share_mp ~ elected_via + incumbent+area_district+mean_place_density + list_no+ gender+birthyear, df_analysis |> filter(data == "Twitter Dataset"))

#FB: without controls
fb_mod_00 <-lm(mean_share_mp ~ elected_via, df_analysis |> filter(data == "Facebook Dataset"))
#FB: with controls
fb_mod_01 <-lm(mean_share_mp ~ elected_via+incumbent+area_district +mean_place_density + list_no+ gender+birthyear, df_analysis |> filter(data == "Facebook Dataset"))


terms <- c("elected_viaList", "incumbent","list_no","gendermale","birthyear","area_district" ,"mean_place_density","incumbentNo Incumbent")

tidy_model01 <- tidy(tw_mod_01)|> filter(term %in% terms)|> mutate(model = "Twitter Dataset", hypothesis = "H0: District Match")
tidy_model02 <- tidy(fb_mod_01)|> filter(term %in% terms)|> mutate(model = "Facebook Dataset",hypothesis = "H0: District Match")

tidy_models_h0 <- rbind(tidy_model01, tidy_model02) |> 
  mutate(term = case_when(
    term == "elected_viaList" ~ "Mandate Mode (List)",
    term == "list_no" ~ "List Number",
    term == "birthyear" ~ "Age",
    term == "incumbent" ~ "Incumbent",
    term == "mean_place_density"~ "Place Density",
    term == "area_district" ~ "District Size",
    term == "incumbentNo Incumbent" ~ "Incumbent (No)",
    term == "gendermale" ~ "Gender (Male)",
    TRUE ~ term
  ))


dwplot(tidy_models_h0, 
       dot_args = list( size = 3, shape = 18), 
       whisker_args = list(size = 1)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  facet_wrap(~hypothesis, scales = "free_x") +
  theme_bw() + 
  labs(x = "Estimate", y = "Terms") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "bottom",
        legend.background = element_rect(colour = "grey80"),
        legend.title = element_text(hjust = 0.5)) +
  scale_shape_discrete(name = "Data") +
  scale_colour_manual(name = "Data", 
                      values = c("Facebook Dataset" = "#1b9e77",   # green
                                 "Twitter Dataset" = "#d95f02"))   # orange


ggsave("output/coefficients_validation_h0.png", dpi = 1200, width = 7, height = 5)


######


#### Predicted Values

#H0
plot_data_mod_tw_01 <- ggeffects::ggpredict(tw_mod_01, terms = c("elected_via")) |> as_tibble() |> mutate(model = "Twitter Dataset", hypothesis = "H0: Constituency Saliencey")
plot_data_mod_fb_01 <- ggeffects::ggpredict(fb_mod_01, terms = c("elected_via")) |> as_tibble() |> mutate(model = "Facebook Dataset",hypothesis = "H0: Constituency Saliencey")

plot_data_mod_tw_01$x <- factor(plot_data_mod_tw_01$x, levels = c("Direct", "List"))
plot_data_mod_fb_01$x <- factor(plot_data_mod_fb_01$x, levels = c("Direct", "List"))


# Combine data
plot_data <- bind_rows(plot_data_mod_tw_01, plot_data_mod_fb_01)

# Factor the x variable for consistency
plot_data$x <- factor(plot_data$x, levels = c("Direct", "List"))

common_color_scale <- scale_color_manual(name = "Data", values = c("Facebook Dataset" = "#191819", "Twitter Dataset" = "#666566"))
common_shape_scale <- scale_shape_manual(name = "Data", values = c("Facebook Dataset" = 16, "Twitter Dataset" = 17))

p_twitter <- plot_data %>%
  filter(model == "Twitter Dataset") %>%
  ggplot(aes(x = x, y = predicted)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.15) +
  scale_y_continuous(limits = c(0.2, 0.7)) +
  labs(title = "Twitter Dataset", x = "Mandate Mode", y = "Predicted probability") +
  theme_bw() +
  theme(legend.position = "bottom") +
  common_color_scale +
  common_shape_scale +
  rremove("x.title")

p_facebook <- plot_data %>%
  filter(model == "Facebook Dataset") %>%
  ggplot(aes(x = x, y = predicted)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.15) +
  scale_y_continuous(limits = c(0.5, 0.85)) +
  labs(title = "Facebook Dataset", x = "Mandate Mode", y = "Predicted probability") +
  theme_bw() +
  theme(legend.position = "bottom") +
  common_color_scale +
  common_shape_scale +
  rremove("x.title")

# Side-by-side panels
figure <- ggpubr::ggarrange(p_twitter, p_facebook, align = "v", labels = NULL, common.legend = TRUE, legend = "bottom")
figure
ggsave("output/predicted_values_h0.png", dpi = 1200, width = 7, height = 3)


#######################################################################################################################################
#######################################################################################################################################
###################################################### Hypotheses #####################################################################
#######################################################################################################################################
#######################################################################################################################################


# H1: Direct Candidates are more likely to mention places closer to their constituency ----

#TW: without controls
tw_mod_10 <-lm(mean_distance ~ elected_via, df_analysis |> filter(data == "Twitter Dataset"))

#TW: with controls
tw_mod_11 <-lm(mean_distance ~ elected_via + incumbent+area_district +mean_place_density + list_no+ gender+birthyear, df_analysis |> filter(data == "Twitter Dataset"))

#FB: without controls
fb_mod_10 <-lm(mean_distance ~ elected_via, df_analysis |> filter(data == "Facebook Dataset"))
#FB: with controls
fb_mod_11 <-lm(mean_distance ~ elected_via+incumbent+area_district +mean_place_density + list_no+ gender+birthyear, df_analysis |> filter(data == "Facebook Dataset"))


# H2: Direct Candidates are more likely to mention places in constituencies with worse service access ----

# Twitter: constituency-level services

# All services (constituency-level) – without controls
tw_mod_20 <- lm(mean_service_dist_constituency ~ elected_via,
                df_analysis |> filter(data == "Twitter Dataset"))

# All services (constituency-level) – with controls
tw_mod_21 <- lm(mean_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Twitter Dataset"))

# High-variance services (constituency-level) – with controls
tw_mod_22 <- lm(mean_hvar_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Twitter Dataset"))

# Low-correlation services (constituency-level) – with controls
tw_mod_23 <- lm(mean_lowcorr_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Twitter Dataset"))


# Facebook: constituency-level services

# All services (constituency-level) – without controls
fb_mod_20 <- lm(mean_service_dist_constituency ~ elected_via,
                df_analysis |> filter(data == "Facebook Dataset"))

# All services (constituency-level) – with controls
fb_mod_21 <- lm(mean_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Facebook Dataset"))

# High-variance services (constituency-level) – with controls
fb_mod_22 <- lm(mean_hvar_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Facebook Dataset"))

# Low-correlation services (constituency-level) – with controls
fb_mod_23 <- lm(mean_lowcorr_service_dist_constituency ~ elected_via + incumbent +
                  area_district + mean_place_density + list_no + gender + birthyear,
                df_analysis |> filter(data == "Facebook Dataset"))


tidy_model3 <- tidy(tw_mod_11)|> filter(term %in% terms)|> mutate(model = "Twitter Dataset",hypothesis = "H1: District Distance")
tidy_model4 <- tidy(fb_mod_11)|> filter(term %in% terms)|> mutate(model = "Facebook Dataset",hypothesis = "H1: District Distance")
tidy_model5 <- tidy(tw_mod_21) |> filter(term %in% terms) |> mutate(model = "Twitter Dataset", hypothesis = "H2: Service Deprivation")
tidy_model6 <- tidy(fb_mod_21) |> filter(term %in% terms) |> mutate(model = "Facebook Dataset", hypothesis = "H2: Service Deprivation")


tidy_models_hypotheses <- rbind(tidy_model3, tidy_model4, tidy_model5, tidy_model6) |> 
  mutate(term = case_when(
    term == "elected_viaList" ~ "Mandate Mode (List)",
    term == "list_no" ~ "List Number",
    term == "birthyear" ~ "Age",
    term == "incumbent" ~ "Incumbent",
    term == "area_district" ~ "District Size",
    term == "mean_place_density" ~ "Place Density",
    term == "incumbentNo Incumbent" ~ "Incumbent (No)",
    term == "gendermale" ~ "Gender (Male)",
    TRUE ~ term
  ))



dwplot(tidy_models_hypotheses, 
       dot_args = list( size = 3, shape = 18), 
       whisker_args = list(size = 1)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  facet_wrap(~hypothesis, scales = "free_x") +
  theme_bw() + 
  labs(x = "Estimate", y = "Terms") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "bottom",
        legend.background = element_rect(colour = "grey80"),
        legend.title = element_text(hjust = 0.5)) +
  scale_shape_discrete(name = "Data") +
  scale_colour_manual(name = "Data", 
                      values = c("Facebook Dataset" = "#1b9e77",   # green
                                 "Twitter Dataset" = "#d95f02"))   # orange




ggsave("output/coefficients.png", dpi = 1200, width = 7, height = 5)



# Robustness ----

## Direct Candidates are more likely to mention places that are lacking service access

# Twitter Models

#with controls 
tw_mod_31 <- lm(mean_service_dist ~ elected_via + incumbent + area_district + mean_place_density + list_no + gender + birthyear, df_analysis |> filter(data == "Twitter Dataset"))

# DV: high variance services
tw_mod_32 <- lm(mean_hvar_service_dist ~ elected_via + incumbent + area_district + mean_place_density + list_no + gender + birthyear, df_analysis |> filter(data == "Twitter Dataset"))

# DV: low correlation services
tw_mod_33 <- lm(mean_lowcorr_service_dist ~ elected_via + incumbent + area_district + mean_place_density + list_no + gender + birthyear, df_analysis |> filter(data == "Twitter Dataset"))


#with controls 
fb_mod_31 <- lm(mean_service_dist ~ elected_via + incumbent + area_district +mean_place_density +  list_no + gender + birthyear, df_analysis |> filter(data == "Facebook Dataset"))

#DV: high variance services
fb_mod_32 <- lm(mean_hvar_service_dist ~ elected_via + incumbent + area_district+ mean_place_density + list_no + gender + birthyear, df_analysis |> filter(data == "Facebook Dataset"))

#DV: DV: low correlation services
fb_mod_32 <- lm(mean_lowcorr_service_dist ~ elected_via + incumbent + area_district + mean_place_density  + list_no + gender + birthyear, df_analysis |> filter(data == "Facebook Dataset"))

##  Plots for Appendix ----

get_ci_table <- function(model, dataset, hypothesis, dv, controls) {
  
  # 95% CI
  tidy_95 <- broom::tidy(model, conf.int = TRUE, conf.level = 0.95) |>
    select(term,
           estimate,
           conf.low_95  = conf.low,
           conf.high_95 = conf.high)
  
  # 90% CI
  tidy_90 <- broom::tidy(model, conf.int = TRUE, conf.level = 0.90) |>
    select(term,
           conf.low_90  = conf.low,
           conf.high_90 = conf.high)
  
  # Join and add metadata
  left_join(tidy_95, tidy_90, by = "term") |>
    mutate(
      dataset    = dataset,
      hypothesis = hypothesis,
      dv         = dv,
      controls   = controls
    ) |>
    select(hypothesis, dv, dataset, controls,
           term, estimate,
           conf.low_95, conf.high_95,
           conf.low_90, conf.high_90)
}



models_info <- tribble(
  ~model_obj, ~dataset,           ~hypothesis, ~dv,                               ~controls,
  # H2 – Twitter
  tw_mod_31,  "Twitter Dataset",  "H2: Service Deprivation", "mean_service_dist",              "with controls",
  tw_mod_32,  "Twitter Dataset",  "H2: Service Deprivation", "mean_hvar_service_dist",         "with controls",
  tw_mod_33,  "Twitter Dataset",  "H2: Service Deprivation", "mean_lowcorr_service_dist",      "with controls",
  # H2 – Facebook
  fb_mod_31,  "Facebook Dataset", "H2: Service Deprivation", "mean_service_dist",              "with controls",
  fb_mod_32,  "Facebook Dataset", "H2: Service Deprivation", "mean_hvar_service_dist",         "with controls",
  fb_mod_32,  "Facebook Dataset", "H2: Service Deprivation", "mean_lowcorr_service_dist",      "with controls",  # as in your code
  
  # H3 – Twitter (constituency-level services)
  tw_mod_21,  "Twitter Dataset",  "H2: Constituency Service Deprivation", "mean_service_dist_constituency",         "with controls",
  tw_mod_22,  "Twitter Dataset",  "H2: Constituency Service Deprivation", "mean_hvar_service_dist_constituency",    "with controls",
  tw_mod_23,  "Twitter Dataset",  "H2: Constituency Service Deprivation", "mean_lowcorr_service_dist_constituency", "with controls",
  
  # H3 – Facebook (constituency-level services)
  fb_mod_21,  "Facebook Dataset", "H2: Constituency Service Deprivation", "mean_service_dist_constituency",         "with controls",
  fb_mod_22,  "Facebook Dataset", "H2: Constituency Service Deprivation", "mean_hvar_service_dist_constituency",    "with controls",
  fb_mod_23,  "Facebook Dataset", "H2: Constituency Service Deprivation", "mean_lowcorr_service_dist_constituency", "with controls"
)


ci_table_h2_h3 <- models_info |>
  mutate(
    tbl = pmap(
      list(model_obj, dataset, hypothesis, dv, controls),
      get_ci_table
    )
  ) |>
  pull(tbl) |>
  bind_rows()

ci_table_h2_h3_main <- ci_table_h2_h3 |>
  filter(term == "elected_viaList") |> 
  mutate(
    dv = dplyr::case_when(
      dv == "mean_service_dist" ~ "All services",
      dv == "mean_hvar_service_dist" ~ "Max variance\nservices",
      dv == "mean_lowcorr_service_dist" ~ "Min correlation\nservices",
      
      dv == "mean_service_dist_constituency" ~ "All services\n(constituency)",
      dv == "mean_hvar_service_dist_constituency" ~ "Max variance services\n(constituency)",
      dv == "mean_lowcorr_service_dist_constituency" ~ "Min correlation services\n(constituency)",
      
      TRUE ~ dv
    )
  )



  
ci_table_h2_h3_main$hypothesis <- factor(
  ci_table_h2_h3_main$hypothesis,
  levels = rev(levels(factor(ci_table_h2_h3_main$hypothesis)))
)


ggplot(ci_table_h2_h3_main,
       aes(y = dv, x = estimate, colour = dataset)) +
  
  # Zero line
  geom_vline(xintercept = 0, linetype = "dashed", colour = "grey60") +
  
  # 95% CI (thin)
  geom_errorbarh(
    aes(xmin = conf.low_95, xmax = conf.high_95),
    height = 0.30,
    alpha  = 0.6
  ) +
  
  # 90% CI (thick)
  geom_errorbarh(
    aes(xmin = conf.low_90, xmax = conf.high_90),
    height = 0.15,
    linewidth = 1
  ) +
  
  geom_point(size = 2) +
  
  facet_grid(hypothesis ~ dataset, scales = "free_y") +
  scale_colour_manual(
    name = "Data",
    values = c(
      "Facebook Dataset" = "#1b9e77",  # green
      "Twitter Dataset"  = "#d95f02"   # orange
    )
  ) +
  labs(
    x = "Coefficient estimate for elected_viaList",
    y = NULL,
    colour = "Dataset",
    title = "Effect of list election status (elected_viaList)",
    subtitle = "Dots = estimate, thick bars = 90% CI, thin bars = 95% CI"
  ) +
  
  theme_minimal() +
  theme(
    panel.border = element_rect(colour = "black", fill = NA, linewidth = 0.6),
    panel.spacing = unit(1.2, "lines")
  )


ggsave("output/robustness_checks.png", width = 10, height = 8, dpi = 700)





